
from pyspark.sql import SparkSession,SQLContext


# This code is needed on Windows

# Create the Spark environment: the SparkSession is the entry point for
# the DataFrame and SQL APIs.
spark = SparkSession.builder.appName("test").getOrCreate()

# Read the parquet partition ONCE and reuse the DataFrame. The original
# code read the same S3 object twice (once for the preview, once for df),
# doubling the S3 list/read overhead for no benefit.
df = spark.read.parquet("s3://ods/ac_kyd/p_date=2019-06/part-00004-76b2b753-2a9e-4a5c-8807-332da823eac4.c000.snappy.parquet")

# Preview up to 100 rows; truncate=False so long column values are shown in full.
df.limit(100).show(100, False)
# BUG FIX: the original ran spark.sql("") — an empty query string — which
# raises a ParseException at runtime (and the DataFrame was never registered
# as a temp view, so no SQL could reference it anyway). Collect the first
# column of the loaded DataFrame instead, which matches how the rows were
# consumed below (row[0] per row).
# TODO(review): confirm the SQL query that was originally intended here.
rows = df.select(df.columns[0]).collect()
# Comprehension replaces the manual append loop (same result, idiomatic).
materials = [row[0] for row in rows]

# Write the DataFrame back out as parquet, replacing any existing output
# at the destination prefix.
# NOTE(review): the "header" option only applies to CSV writes and is a
# no-op for parquet — confirm whether CSV output was intended.
writer = df.write.option("header", "true").mode("Overwrite")
writer.parquet("s3://batch/emr/ljc/")

# Stop the Spark environment and release cluster resources.
spark.stop()

